########################
## Libraries
########################
import os
import pandas as pd
import gc
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
gc.collect()
########################
## Variables
########################
PCT_MISSING = .30 ## Threshold: treat a column as droppable when >= 30% of its values are NaN
LENDING_PATH = os.path.join("data") ## Directory holding the data files (./data)
LENDING_FILE = "loan.csv" ## Source data file name
BACKUP_FILE = "backup.csv" ## Checkpoint/backup data file name (rebound later in the script)
########################
### Functions
### Load Data Set
########################
def load_lending_data(lending_path = LENDING_PATH, lending_file = LENDING_FILE):
    """Read the raw Lending Club CSV into a DataFrame.

    low_memory=False makes pandas scan the whole file before choosing
    column dtypes, avoiding mixed-dtype warnings on this very wide file.
    """
    return pd.read_csv(os.path.join(lending_path, lending_file), low_memory=False)
### Load latest backup data set
def load_backup_data(lending_path=None, backup_file=None):
    """Read the most recent checkpoint CSV back into a DataFrame.

    Bug fix: the old signature bound backup_file=BACKUP_FILE when the
    function was *defined*, so rebinding BACKUP_FILE later in the script
    (e.g. BACKUP_FILE = "preEDA.csv") had no effect on argument-less
    calls. Defaults are now resolved at call time via None sentinels,
    which is backward compatible for every existing caller.

    Parameters:
        lending_path: directory of the CSV (default: current LENDING_PATH).
        backup_file:  file name of the CSV (default: current BACKUP_FILE).
    Returns:
        pandas.DataFrame with the checkpoint contents.
    """
    if lending_path is None:
        lending_path = LENDING_PATH
    if backup_file is None:
        backup_file = BACKUP_FILE
    csv_path = os.path.join(lending_path, backup_file)
    return pd.read_csv(csv_path, low_memory=False)
## Backup current data set
def writeLendingData(lending_path=None, backup_file=None, df=None):
    """Checkpoint a DataFrame (default: the module-level `lending`) to CSV.

    Bug fix: the old signature bound backup_file=BACKUP_FILE at def time,
    so rebinding BACKUP_FILE later (e.g. to "preEDA.csv" before the EDA
    checkpoint) silently kept writing backup.csv. Defaults are now
    resolved at call time. The new optional `df` parameter is backward
    compatible and removes the hard dependency on the global frame.

    Parameters:
        lending_path: directory for the CSV (default: current LENDING_PATH).
        backup_file:  file name for the CSV (default: current BACKUP_FILE).
        df:           DataFrame to save (default: the global `lending`).
    """
    if lending_path is None:
        lending_path = LENDING_PATH
    if backup_file is None:
        backup_file = BACKUP_FILE
    if df is None:
        df = lending  # fall back to the module-level working frame
    csv_path = os.path.join(lending_path, backup_file)
    df.to_csv(csv_path, index=None, header=True)
## Drop columns
def drop_irrelevant_columns(columnList):
    """Drop the given columns from the global `lending` frame in place,
    then print the shape before and after the drop."""
    shape_before = lending.shape
    lending.drop(columnList, axis=1, inplace=True)
    print("\nBefore: rows,columns", shape_before, "\nAfter: rows,columns", lending.shape)
    gc.collect()
## Drop any column that is missing at least pctMissing (default 30%) of its values.
def remove_NaNs(pctMissing = PCT_MISSING):
    """Remove, in place, every column of the global `lending` frame whose
    NaN fraction is >= pctMissing, and report how many were dropped."""
    shape_before = lending.shape
    na_share = lending.isnull().sum() / len(lending)
    to_drop = list(na_share[na_share.values >= pctMissing].index)
    lending.drop(labels=to_drop, axis=1, inplace=True)
    print("Number of Columns dropped\t: ", len(to_drop))
    print("\nBefore: rows,columns", shape_before, "\nAfter: rows,columns", lending.shape)
### Determine the columns whose missing-value count exceeds pctMissing of all rows.
def showMissingValues(pctMissing = PCT_MISSING):
    """Return a Series (column name -> NaN count) for columns of the global
    `lending` frame with more than a pctMissing fraction of values missing.

    Bug fix: removed the print statement that followed the `return`
    (it was unreachable) together with the unused `dfShape` local.
    """
    na_counts = lending.isnull().sum()
    return na_counts[na_counts.values > (pctMissing * len(lending))]
## Plot missing values based on current PCT_MISSING
def plotMissingValues():
    """Bar-plot the NaN counts of columns exceeding the PCT_MISSING threshold.

    Bug fix: the original ended with `plt.show` (a bare reference that
    does nothing); it must be called as `plt.show()` to render the figure
    outside of notebook auto-display.
    """
    ## Determine missing values and plot
    missingValues = showMissingValues()
    plt.figure(figsize=(20, 4))
    missingValues.plot(kind='bar')
    title = 'Columns where NA > ' + str(PCT_MISSING*100) + '%'
    plt.title(title)
    plt.show()
# Drop the rows even with single NaN or single missing values.
def dropNanRows():
    """Remove, in place, every row of the global `lending` frame that
    contains any NaN, and print the shape before and after.

    Bug fix: the original called `lending.dropna()` and discarded the
    returned frame, so no rows were ever removed (the Before/After shapes
    it printed were always identical). `inplace=True` actually mutates.
    """
    dfShape = lending.shape
    lending.dropna(inplace=True)
    print("\nBefore: rows,columns", dfShape, "\nAfter: rows,columns", lending.shape)
## Remove data not available prior to loan being established
## (post-origination outcomes would leak the target into the features).
# Identifiers and free-text fields with no predictive value.
miscColumns = ['id', 'member_id', 'emp_title', 'desc', 'zip_code', 'title',
               'url', 'policy_code'
               ]
# Hardship-plan fields: only populated after a borrower runs into trouble.
hardshipColumns = ['hardship_flag', 'hardship_type', 'hardship_reason',
                   'hardship_status', 'deferral_term', 'hardship_amount',
                   'hardship_start_date', 'hardship_end_date',
                   'payment_plan_start_date', 'hardship_length', 'hardship_dpd',
                   'hardship_loan_status', 'orig_projected_additional_accrued_interest',
                   'hardship_payoff_balance_amount', 'hardship_last_payment_amount',
                   'pymnt_plan'
                   ]
# Debt-settlement fields: describe post-default settlement outcomes.
settlementColumns = ['debt_settlement_flag', 'debt_settlement_flag_date',
                     'settlement_status','settlement_date','settlement_amount',
                     'settlement_percentage', 'settlement_term'
                     ]
# Payments already received on the loan.
fundsReceviedColumns = ['total_pymnt', 'total_pymnt_inv', 'total_rec_int',
                        'total_rec_late_fee', 'total_rec_prncp'
                        ]
# Outstanding principal, known only during repayment.
postLoanInformation = ['out_prncp', 'out_prncp_inv']
# Payment-schedule details generated during repayment.
loanPaymentColumns = ['last_pymnt_amnt', 'last_pymnt_d','next_pymnt_d',
                      'recoveries'
                      ]
## Load the data set into the module-level working frame.
lending = load_lending_data()
## Drop the irrelevant column groups that we already know of.
drop_irrelevant_columns(miscColumns)
drop_irrelevant_columns(hardshipColumns)
drop_irrelevant_columns(settlementColumns)
drop_irrelevant_columns(fundsReceviedColumns)
drop_irrelevant_columns(loanPaymentColumns)
drop_irrelevant_columns(postLoanInformation)
## PCT Missing Analysis: visualise the columns still above the NaN threshold.
plotMissingValues()
The attributes above have more than 30% of their values missing. Most of these attributes relate to a secondary applicant, which makes sense, as not all applicants will have a joint or secondary applicant. Let's determine whether we should remove the secondary applicant data.
## Pie chart of application_type counts.
## NOTE(review): labels are hard-coded as ("Single","Joint") and explode has
## exactly two entries — this assumes value_counts() returns exactly two
## categories with the individual type first; verify against the data.
lending["application_type"].value_counts().plot.pie(explode=[0,0.25], autopct='%1.2f%%',
                                                    shadow=True, labels=("Single","Joint"),
                                                    fontsize=12, startangle=70,figsize=(16,8))
## Create function to drop secondary applicant attributes
def dropJointApplicant():
    """Remove every secondary/joint-applicant column from the global
    `lending` frame via drop_irrelevant_columns."""
    secondary_applicant_cols = [
        'sec_app_earliest_cr_line', 'sec_app_inq_last_6mths',
        'sec_app_mort_acc', 'sec_app_open_acc', 'sec_app_revol_util',
        'sec_app_open_act_il', 'sec_app_num_rev_accts',
        'sec_app_chargeoff_within_12_mths',
        'sec_app_collections_12_mths_ex_med',
        'sec_app_mths_since_last_major_derog',
        'annual_inc_joint', 'dti_joint', 'revol_bal_joint',
        'verification_status_joint',
    ]
    drop_irrelevant_columns(secondary_applicant_cols)
## Drop secondary applicant attributes
dropJointApplicant()
## Re-check which columns still exceed the missing-value threshold.
plotMissingValues()
## NaN in these "months since <event>" columns means the event never
## happened for that borrower; impute a large sentinel (999) so "never"
## sorts after any real recency value.
monthsSinceColumns = [
    'mths_since_last_delinq',           # months since last delinquency
    'mths_since_last_major_derog',      # months since 90-day-or-worse rating
    'mths_since_last_record',           # months since last public record
    'mths_since_rcnt_il',               # months since newest installment account
    'mths_since_recent_bc',             # months since newest bankcard account
    'mths_since_recent_bc_dlq',         # months since last bankcard delinquency
    'mths_since_recent_inq',            # months since most recent inquiry
    'mths_since_recent_revol_delinq',   # months since last revolving delinquency
]
for col in monthsSinceColumns:
    # Assign back rather than Series.fillna(..., inplace=True): in-place
    # fillna on a column selection is the deprecated chained-assignment
    # pattern in modern pandas and may silently stop mutating the frame.
    lending[col] = lending[col].fillna(999)
plotMissingValues()
## NaN in these open-trade / inquiry count columns is taken to mean zero
## such accounts or inquiries; impute 0.
countColumns = [
    'open_acc',      # open credit lines in the borrower's file
    'open_acc_6m',   # trades opened in the last 6 months
    'open_il_12m',   # installment accounts opened in past 12 months
    'open_il_24m',   # installment accounts opened in past 24 months
    'open_act_il',   # currently active installment trades
    'open_rv_12m',   # revolving trades opened in past 12 months
    'open_rv_24m',   # revolving trades opened in past 24 months
    'inq_last_12m',  # credit inquiries in past 12 months
]
for col in countColumns:
    # Assign back rather than Series.fillna(..., inplace=True): in-place
    # fillna on a column selection is the deprecated chained-assignment
    # pattern in modern pandas and may silently stop mutating the frame.
    lending[col] = lending[col].fillna(0)
plotMissingValues()
Let's remove the rows that contain NaNs, as these remaining attributes may be important.
## Drop the rows even with single NaN or single missing values.
## NOTE(review): the dropna call below is commented out, so no rows are
## actually removed and the Before/After shapes printed are identical.
dfShape = lending.shape
#lending = lending.dropna()
print("\nBefore: rows,columns",dfShape,"\nAfter: rows,columns",lending.shape)
## Check point save data to CSV (uncomment to write the backup file).
#writeLendingData()
## Visualise the remaining missingness (missingno is a third-party package).
import missingno as msno
msno.matrix(lending)
Lending Club generates revenue by collecting interest payments on loans to its consumers. The goal with any loan is to have it fully paid back over time. Loans that go into default reduce forecasted revenues, which impacts the investors matched with the borrower. The loan_status column was utilized to determine what percentage of loans were bad. With the data given, can we come up with an algorithm to better predict borrowers who will either default or have a higher probability of defaulting on their loans?
First, let's determine from the loan_status columns which loans should be considered bad risk or good risk.
## Distribution of loan_status values as a pie chart.
extract = lending["loan_status"].value_counts()
# One explode offset per status slice. NOTE(review): this assumes exactly
# 9 distinct loan_status values remain — ax.pie raises if lengths differ.
explodeTuple = (0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 0.5, 0.9, 1.3)
fig, ax = plt.subplots(figsize=(16, 10), subplot_kw=dict(aspect="equal"))
ax.pie(extract.values, explode=explodeTuple, labels=extract.index, autopct='%1.1f%%',
       shadow=True, startangle=90)
ax.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
# Bare expression: displays the value counts when run as a notebook cell.
extract
## Define a good or bad risk: these loan statuses count as healthy loans.
goodLoan = ['Current', 'Fully Paid','Does not meet the credit policy. Status:Fully Paid']
# Use risk definition to label a single loan_status value.
def determineLoanRisk (loan_status_):
    """Return 'good' when loan_status_ is one of the healthy statuses,
    otherwise 'bad'."""
    return 'good' if loan_status_ in goodLoan else 'bad'
# Create new column `risk` ('good'/'bad') derived from loan_status.
lending['risk'] = lending['loan_status'].apply(determineLoanRisk)
## Plot the ratio of good risk to bad risk
extract = lending["risk"].value_counts()
# Two slices; pull the second (minority) slice out for emphasis.
explodeTuple = (0.0, 0.3)
fig, ax = plt.subplots(figsize=(16, 10), subplot_kw=dict(aspect="equal"))
ax.pie(extract.values, explode=explodeTuple, labels=extract.index, autopct='%1.1f%%',
       shadow=True, startangle=90)
ax.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
############################################################
## Save Current DataSet before running EDA
##
## Saving the current dataset gives us the opportunity
## to jump right into the EDA or model building without
## running all of the data prep code.
############################################################
BACKUP_FILE = "preEDA.csv"
## Bug fix: pass the file name explicitly. writeLendingData's default was
## bound to the *original* BACKUP_FILE ("backup.csv") when the function was
## defined, so rebinding the module constant above had no effect on an
## argument-less call and the checkpoint silently went to backup.csv.
writeLendingData(backup_file=BACKUP_FILE)
## Reload the checkpoint (uncomment to resume the analysis from here).
#lending = load_backup_data(backup_file=BACKUP_FILE)
## Generate an automated EDA report (pandas_profiling is third-party).
import pandas_profiling as pp
pp.ProfileReport(lending)